
########################################################
### functions to extract number mentions from speech ###
########################################################


# function "word_to_number" takes written-out numbers and transforms them to arabic numerals ####
#
# Converts ONE number phrase (e.g. "Two hundred and five", "2.5 million") to a
# single numeric value.
#
# x: character string holding one number phrase. Every whitespace-separated
#    word in x is treated as part of the same number.
#
# Returns a single numeric value. Returns NA (with a warning) when the phrase
# holds no words, when a word is missing from word_to_number_reference /
# magnitude_reference, or when word_to_number_single() cannot translate it.
word_to_number <- function(x) {
  known_words <- names(c(word_to_number_reference, magnitude_reference))

  # separate into distinct words; splitting first keeps numerals such as
  # "2.5" or ".5" in one piece
  words <- as.character(unlist(strsplit(x, "\\s+")))

  # lower-case and strip punctuation from every word that contains letters;
  # purely numeric tokens are left alone so their decimal point survives
  has_letters <- grepl("[a-zA-Z]", words)
  words[has_letters] <- tolower(gsub("[[:punct:]]", " ", words[has_letters]))

  # punctuation that became a space may have split a token ("twenty-one"), so
  # split again, then drop blanks, missing values and the filler word "and"
  words <- as.character(unlist(strsplit(words, "\\s+")))
  words <- words[!is.na(words) & nzchar(words) & words != "and"]

  # nothing left to translate
  if (length(words) == 0L) {
    warning(sprintf("Unable to translate '%s'",
                    paste0(x, collapse = ", ")))
    return(NA_real_)
  }

  # verify that all words are found in the list of numbers and number words;
  # an unknown word can only produce NA (or an error) further down, so stop
  # here and name the offending words
  unknown <- !(words %in% known_words)
  if (any(unknown)) {
    warning(sprintf("Unable to translate '%s'",
                    paste0(words[unknown], collapse = ", ")))
    return(NA_real_)
  }

  # run the word conversion on the cleaned words
  num <- word_to_number_single(words)

  # warn about phrases that still could not be translated
  if (any(is.na(num))) {
    warning(sprintf("Unable to translate '%s'",
                    paste0(words, collapse = ", ")))
  }

  num
}



# function "word_to_number_single" converts text processed by "word_to_number" into numbers ####
#
# x: character vector of cleaned number words / numerals in spoken order,
#    e.g. c("two", "million", "three", "thousand", "one").
#
# The words are looked up in word_to_number_reference / magnitude_reference,
# cut into groups that each end at a magnitude word, translated group by group
# with word_to_number_translate_hundred(), and summed.
#
# Returns a single numeric value; NA when x is empty or a group cannot be
# translated.
word_to_number_single <- function(x) {
  # no words: there are no groups to build
  if (length(x) == 0L) {
    return(NA_real_)
  }

  # check the reference data for the arabic numeral that corresponds to each word
  num <- c(word_to_number_reference, magnitude_reference)[x]

  # flag positions holding a magnitude indicator
  is_magnitude <-
    names(num) %in%
      c("quadrillion", "trillion", "billion",
        "million", "thousand")

  # a magnitude word closes its group, so the group counter goes up on the
  # element that FOLLOWS each magnitude word
  group_id <- cumsum(c(FALSE, head(is_magnitude, -1)))

  # one named numeric vector per magnitude class, in spoken order
  num_component <- split(num, group_id)

  # translate each component
  component_value <-
    vapply(num_component,
           FUN = word_to_number_translate_hundred,
           FUN.VALUE = numeric(1))

  # add the components together
  sum(component_value)
}

# function "word_to_number_translate_hundred" multiplies numbers by their order of magnitude ####
#
# n: named numeric vector for one magnitude group; the names are the original
#    words, e.g. c(two = 2, hundred = 100, five = 5, thousand = 1000).
#
# Returns one numeric value: everything before "hundred" is summed and
# multiplied by 100, everything after it is added, and the total is multiplied
# by the trailing magnitude word (or by 1 when there is none).
# Returns NA when there is nothing to multiply (empty group or a bare
# magnitude word) or when "hundred" occurs more than once.
word_to_number_translate_hundred <- function(n) {
  if (length(n) == 0L) {
    return(NA_real_)
  }

  # set a magnitude multiplier for thousands and greater
  magnitude <- 1
  if (isTRUE(tail(names(n), 1) %in% names(magnitude_reference))) {
    magnitude <- tail(n, 1)
    n <- head(n, -1)
  }

  # a bare magnitude word ("thousand") leaves nothing to scale
  if (length(n) == 0L) {
    return(NA_real_)
  }

  which_hundred <- which(names(n) == "hundred")

  # "hundred" given twice within one group is ambiguous
  if (length(which_hundred) > 1L) {
    return(NA_real_)
  }

  if (length(which_hundred) == 1L) {
    # covers "one hundred five" as well as "twelve hundred" (1200);
    # a leading "hundred" has an empty multiplier and so contributes 0
    position <- seq_along(n)
    value <- sum(n[position < which_hundred]) * 100 +
      sum(n[position > which_hundred])
  } else {
    # no "hundred": the parts simply add up ("twenty" + "one")
    value <- sum(n)
  }

  value * magnitude
}

# reference data: key for text to arabic numerals ####
# word_to_number_single() looks each word up by name in these vectors to
# translate characters to numerals

# CORE REFERENCE NUMBERS
# written-out number words, keyed by the lower-case word

word_to_number_reference <- 
  c("zero" = 0,
    "one" = 1,
    "two" = 2,
    "three" = 3,
    "four" = 4,
    "five" = 5,
    "six" = 6,
    "seven" = 7,
    "eight" = 8,
    "nine" = 9,
    "ten" = 10,
    "eleven" = 11,
    "twelve" = 12,
    "thirteen" = 13,
    "fourteen" = 14,
    "fifteen" = 15,
    "sixteen" = 16,
    "seventeen" = 17,
    "eighteen" = 18,
    "nineteen" = 19,
    "twenty" = 20,
    "thirty" = 30,
    "forty" = 40,
    "fifty" = 50,
    "sixty" = 60,
    "seventy" = 70,
    "eighty" = 80,
    "ninety" = 90,
    "hundred" = 100)

# numerals that may appear as digits inside a number phrase (e.g. the "2.5" in
# "2.5 million"): halves from 0 to 10, every integer from 11 to 1000, and a
# handful of larger round values
nums <- c(seq(0, 10, .5), 11:1000, 10000, 100000, 112000, 2000000, 200000, 200000000, 250000, 300000, 400000, 500000, 5000000, 1000000, 12000000, 10000000)
# key every numeral by its plain decimal spelling; as.character() would file
# values such as 100000 under "1e+05", which no digit token can ever match
names(nums) <- vapply(nums, format, character(1), scientific = FALSE, digits = 15)
# one half is keyed as ".5" rather than "0.5"
names(nums)[2] <- ".5"
word_to_number_reference <- c(word_to_number_reference, nums)

# MAGNITUDE REFERENCE NUMBERS
# multiplier for each magnitude word, keyed by the lower-case word

magnitude_reference <- structure(
  c(1000, 1e6, 1e9, 1e12, 1e15),
  names = c("thousand", "million", "billion", "trillion", "quadrillion")
)



# function "find_and_replace" looks through text and finds numbers to translate ####
#
# Scans one piece of text for number mentions (written-out words and/or digits)
# and converts each mention to a numeric value.
#
# x:   a single character string (or NA).
# ret: "num" (default) returns the numeric values that were found;
#      "loc" returns, for each of those values, the index of its segment after
#      the text has been split at runs of non-number words.
#
# Returns NA when x is NA, otherwise a vector with one entry per number that
# could be translated (possibly empty).
#
# NOTE(review): the str_* helpers and bind_cols() are called unqualified --
# presumably stringr and dplyr attached elsewhere; confirm.
find_and_replace <- function(x, ret="num"){
  if(is.na(x)){return(NA)}

  # list of basic number words, for use in identifying numbers in text
  number_words <- c("one", "zero", "two", "three", "four", "five", "six",
                    "seven", "eight", "nine", "ten", "eleven", "twelve",
                    "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
                    "eighteen", "nineteen", "twenty", "thirty", "forty",
                    "fifty", "sixty", "seventy", "eighty", "ninety",
                    "hundred", "thousand", "million", "billion")
  # add word barriers on both sides of every word (count "one", not "someone")
  numbers <- paste0("\\b", number_words, "\\b", collapse = "|")
  # add digits: a token made up of digits only. paste0() keeps a stray space
  # out of the pattern, which would otherwise stop "billion" from matching
  numbers <- paste0(numbers, "|^[0-9]*$")
  # magnitude words that can trail a number
  magnitudes <- c("hundred", "thousand", "million", "billion")

  #initial cleaning
  #remove times (e.g. 4:00)
  x <- str_remove_all(x, "[:digit:]:[:digit:][:digit:]")
  #replace "," with "and" to separate lists of numbers
  x <- str_replace_all(x, ", ", " and ")
  #replace "and a half" with .5
  x <- str_replace_all(x, "and a half", ".5")

  #remove punctuation
  x <- tolower(gsub("([#$'!?\\:])", " ", x))
  x <- tolower(str_replace_all(x, "\\-|_", " "))
  x <- str_remove_all(x, "\\.\\.")

  #add some colloquialisms
  # slang magnitudes first, so "a grand" / "a mil" reach the "a <magnitude>"
  # rules below
  x <- str_replace_all(x, "grand", "thousand")
  x <- str_replace_all(x, "\\bmill\\b|\\bmil\\b", "million")
  # longer phrase before the shorter "a million" rule that would shadow it
  x <- str_replace_all(x, "half a million", "500000")
  # "a hundred thousand" needs no rule of its own: it becomes "100 thousand"
  # here and is multiplied out by word_to_number() later
  x <- str_replace_all(x, "a hundred", "100")
  x <- str_replace_all(x, "a thousand", "1000")
  x <- str_replace_all(x, "a million", "one million")
  x <- str_replace_all(x, "couple", "two")
  x <- str_replace_all(x, "seven fifty", "750")
  x <- str_replace_all(x, "two fifty", "250")

  #remove phrases that are not preference expressions
  # NOTE(review): the digit alternative in `numbers` is anchored with ^...$, so
  # inside this longer pattern only number WORDS can match ("five years" is
  # blanked, "5 years" is not) -- confirm whether digits should be covered too
  x <- str_replace_all(x, paste0("(", numbers, ") (\\bof\\b|\\bat\\b|year|person\\b|people|minute|vote)"), "blah")
  x <- str_remove_all(x, "a group one|\\band\\b|a \\b[a-z|0-9]*\\b one|an \\b[a-z|0-9]*\\b one|another one")

  #split into words at spaces
  x <- strsplit(x, "\\s+")
  #trim whitespace
  x <- lapply(X = x,
              FUN = trimws)
  #remove commas now that words are separated
  x[[1]] <- str_remove_all(x[[1]], ",")

  ## identify and fix cases like "10 to 20 thousand"
  #create indexed list of values
  # NOTE(review): rle() merges identical neighbouring words and the lengths are
  # reset to 1 below, so an immediately repeated word is kept only once
  r <- rle(unlist(x))
  vals <- r$values
  #find the numbers in the list
  x0 <- which(grepl(numbers, vals)) # index positions of numbers
  #find things that match the pattern "x or y million" or "x to y million"
  #(or hundred, thousand, billion); positions past the end give NA, which
  #neither %in% nor grepl() treats as a match
  xt <- (vals[x0+1] %in% c("to", "or")) &
    (grepl(numbers, vals[x0+2])) &
    (grepl(paste(magnitudes, collapse = "|"), vals[x0+3]))
  #take the magnitude of the second number and apply it to the first
  k <- 0   # words inserted so far; shifts the later insert positions
  for(i in seq_len(sum(xt))){
    r$values <- append(r$values, vals[(x0[xt])+3][i], after=x0[xt][i]+k)
    k <- k + 1
  }
  r$lengths <- rep(1, length(r$values))
  #send modified versions back out
  x <- list(inverse.rle(r))

  #group together words belonging to a single number (e.g. "one million")
  #remove periods from written numbers (but not arabic ones--keep decimals)
  num <- grepl(numbers, str_remove(x[[1]], "\\."))
  #replace all non-number words with #
  x[[1]][num==FALSE] <- "#"
  #re-collapse words into string
  x <- paste0(x[[1]], collapse=" ")
  #split string into items at #, keeping consecutive number words together
  #but splitting from non-consecutive number words
  x <- str_split(x, "#")
  #trim white space
  x <- lapply(X = x,
              FUN = trimws)
  #pull out position of non-blank numbers
  num <- which(!unlist(lapply(x, function(x) x=="")))
  x <- x[[1]][x[[1]]!=""]

  #create columns for number and its location
  x <- bind_cols("num"=x, "loc"=num)

  #new round of cleaning
  #remove pattern "x y"- not clear what it means
  x <- x[!str_detect(x$num, "[:digit:] [:digit:]"),]
  #remove lone hundred, thousand, million, billion without modifier
  x <- x[!trimws(x$num) %in% magnitudes,]
  x <- x[!str_detect(x$num, paste0("^", magnitudes, collapse = "|")),]
  x$num <- str_replace(x$num, "thousand thousand", "thousand")
  x$num <- str_replace(x$num, "million million", "million")
  #remove blanks
  x <- x[x$num!="",]
  #remove spaces from arabic numerals
  x$num[!grepl("[a-z]", x$num)] <- str_remove_all(x$num[!grepl("[a-z]", x$num)], " ")

  # CONVERT
  #attempt to convert all to numeric (written-out numbers come back as NA)
  x$out <- as.numeric(x$num)

  #for numbers that can't be directly converted to numeric, run them through word_to_number function
  if(length(x$out[is.na(x$out)])>=1){
    x$out[is.na(x$out)] <- unlist(lapply(x$num[is.na(x$out)], word_to_number))
  }
  #remove original column
  x$num <- NULL
  #remove NAs
  x <- x[!is.na(x$out),]

  #if specifying you want location, return location
  if(ret=="loc"){
    return(x$loc)
  }
  #otherwise, return number
  x$out
}

